##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Rows: 73100 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): Store ID, Product ID, Category, Region, Weather Condition, Seasona...
## dbl (8): Inventory Level, Units Sold, Units Ordered, Demand Forecast, Price...
## date (1): Date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Daily time series
daily_sum <- store.data %>%
group_by(Date) %>%
summarise(
Total_Units_Sold = sum(`Units Sold`, na.rm = TRUE)
) %>%
ungroup()# Aggregate daily to weekly time series
weekly.store.ts <- ts(
daily_sum$Total_Units_Sold,
start = c(2022, 1),
end = c(2024, 1),
frequency = 52
)
autoplot(weekly.store.ts) + xlab("Time") + ylab("Total Units Sold")
Conclusion: The data does not appear to be a random walk.
In a random walk, the lag 1 autocorrelation should be very high (close to 1). In our plot, the lag 1 ACF is weak, suggesting the data is not following a typical random walk pattern.
A random walk shows a gradual decline in ACF across lags. Our ACF values quickly fall near zero, indicating a stationary process rather than a random walk.
Most lags fall within the blue confidence bounds, suggesting no strong autocorrelation. If the data were a random walk, significant autocorrelations would persist across many lags.
## Warning in adf.test(weekly.store.ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: weekly.store.ts
## Dickey-Fuller = -5, Lag order = 4, p-value = 0.01
## alternative hypothesis: stationary
Since our p value is below o.05 we reject the null hypothesis, so our original data is already stationary.
##
## Call:
## arima(x = weekly.store.ts, order = c(1, 0, 0))
##
## Coefficients:
## ar1 intercept
## 0.027 13735.8
## s.e. 0.097 96.7
##
## sigma^2 estimated as 930106: log likelihood = -870, aic = 1747
Double checking using AR(1) model fit, we can see that our ar1 value is 0.027, which is a very week autocorrelation.
# the forecast is equal for all future values and it is equal to the average of the historical (training) data.
average.forecast <- meanf(train.ts, h = nValid)
average.forecast## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2023.635 13735 12435 15034 11734 15736
## 2023.654 13735 12435 15034 11734 15736
## 2023.673 13735 12435 15034 11734 15736
## 2023.692 13735 12435 15034 11734 15736
## 2023.712 13735 12435 15034 11734 15736
## 2023.731 13735 12435 15034 11734 15736
## 2023.750 13735 12435 15034 11734 15736
## 2023.769 13735 12435 15034 11734 15736
## 2023.788 13735 12435 15034 11734 15736
## 2023.808 13735 12435 15034 11734 15736
## 2023.827 13735 12435 15034 11734 15736
## 2023.846 13735 12435 15034 11734 15736
## 2023.865 13735 12435 15034 11734 15736
## 2023.885 13735 12435 15034 11734 15736
## 2023.904 13735 12435 15034 11734 15736
## 2023.923 13735 12435 15034 11734 15736
## 2023.942 13735 12435 15034 11734 15736
## 2023.962 13735 12435 15034 11734 15736
## 2023.981 13735 12435 15034 11734 15736
## 2024.000 13735 12435 15034 11734 15736
# all the forecasts are just the last observation
naive.forecast <- naive(train.ts, h = nValid)
naive.forecast## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2023.635 14666 12925 16407 12003 17329
## 2023.654 14666 12203 17129 10900 18432
## 2023.673 14666 11650 17682 10053 19279
## 2023.692 14666 11183 18149 9339 19993
## 2023.712 14666 10772 18560 8711 20621
## 2023.731 14666 10400 18932 8142 21190
## 2023.750 14666 10059 19273 7620 21712
## 2023.769 14666 9741 19591 7133 22199
## 2023.788 14666 9442 19890 6676 22656
## 2023.808 14666 9159 20173 6244 23088
## 2023.827 14666 8890 20442 5833 23499
## 2023.846 14666 8634 20698 5440 23892
## 2023.865 14666 8387 20945 5063 24269
## 2023.885 14666 8150 21182 4701 24631
## 2023.904 14666 7922 21410 4351 24981
## 2023.923 14666 7700 21632 4013 25319
## 2023.942 14666 7486 21846 3685 25647
## 2023.962 14666 7278 22054 3367 25965
## 2023.981 14666 7075 22257 3057 26275
## 2024.000 14666 6878 22454 2756 26576
# This is similar to drawina a line between first and last observation and extrapolating it into the future
drift.forecast <- rwf(train.ts, h = nValid, drift=TRUE)
drift.forecast## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2023.635 14668 12906 16430 11973 17363
## 2023.654 14670 12163 17177 10836 18504
## 2023.673 14672 11584 17761 9950 19395
## 2023.692 14675 11088 18261 9190 20159
## 2023.712 14677 10645 18709 8510 20844
## 2023.731 14679 10237 19121 7886 21472
## 2023.750 14681 9857 19505 7303 22059
## 2023.769 14683 9498 19869 6753 22614
## 2023.788 14686 9155 20216 6228 23143
## 2023.808 14688 8827 20548 5725 23650
## 2023.827 14690 8511 20869 5240 24140
## 2023.846 14692 8204 21180 4770 24614
## 2023.865 14694 7907 21482 4313 25075
## 2023.885 14696 7616 21776 3868 25524
## 2023.904 14698 7333 22064 3433 25964
## 2023.923 14701 7055 22346 3007 26394
## 2023.942 14703 6782 22623 2590 26816
## 2023.962 14705 6515 22895 2179 27231
## 2023.981 14707 6251 23163 1775 27639
## 2024.000 14709 5992 23427 1377 28042
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 2023.635 12774 11045 14503 10130 15418
## 2023.654 12904 11175 14633 10260 15548
## 2023.673 15101 13372 16830 12457 17745
## 2023.692 13927 12198 15656 11283 16571
## 2023.712 13623 11894 15352 10979 16267
## 2023.731 14104 12375 15833 11460 16748
## 2023.750 13753 12024 15482 11109 16397
## 2023.769 14412 12683 16141 11768 17056
## 2023.788 15670 13941 17399 13026 18314
## 2023.808 13849 12120 15578 11205 16493
## 2023.827 10977 9248 12706 8333 13621
## 2023.846 13780 12051 15509 11136 16424
## 2023.865 13976 12247 15705 11332 16620
## 2023.885 16195 14466 17924 13551 18839
## 2023.904 13613 11884 15342 10969 16257
## 2023.923 15041 13312 16770 12397 17685
## 2023.942 12144 10415 13873 9500 14788
## 2023.962 13536 11807 15265 10892 16180
## 2023.981 12796 11067 14525 10152 15440
## 2024.000 14517 12788 16246 11873 17161
autoplot(train.ts) +
autolayer(average.forecast, series = "Average", PI = FALSE) +
autolayer(naive.forecast, series = "Naive", PI = FALSE) +
autolayer(drift.forecast, series = "Drift", PI = FALSE) +
autolayer(seasonal.naive.forecast, series = "Seasonal naive", PI = FALSE) +
autolayer(valid.ts, series = "Observed") +
xlab("Time") + ylab("Total Units Sold") +
guides(color = guide_legend(title = "Forecast"))## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 5.05 827 685 -0.324 4.99 -0.302 0.614
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -926 1242 1040 -7.13 7.87 -0.302 0.944
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -949 1259 1059 -7.29 8.01 -0.302 0.957
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -94.9 1175 985 -0.882 7.17 -0.0989 0.839
linear_mod <- tslm(train.ts ~ trend)
linear_mod_pred <- forecast(linear_mod, h= nValid, level = 95)
autoplot(linear_mod_pred) +
autolayer(linear_mod_pred$mean, series ="Linear Forecast") +
autolayer(linear_mod$fitted.values, series = "Linear Fit") +
autolayer(valid.ts, series = "Observed") +
xlab("Time") + ylab("Total Units Sold") +
guides(color = guide_legend(title = ""))# We explore exponential trend regression-based time series forecasting if y_t follows exponential trend, then log(y_t) follows a linear trend
exp_mod <- tslm(train.ts ~ trend , lambda = 0) #lambda = 0 indicates exponential trend
exp_mod_pred <- forecast(exp_mod, h= nValid, level=0)
autoplot(exp_mod_pred) +
autolayer(exp_mod_pred$mean, series="Exponential Forecast") +
autolayer(linear_mod_pred$mean, series = "Linear Forecast") +
autolayer(valid.ts, series = "Observed") +
guides(color = guide_legend(title = ""))# Now, we build a regression-based model that captures quadratic trend
quad_mod <- tslm(train.ts ~ trend + I(trend^2))
quad_mod_pred <- forecast(quad_mod, h= nValid, level=0)
autoplot(quad_mod_pred) +
autolayer(quad_mod_pred$mean, series ="Quadratic Forecast") +
autolayer(exp_mod_pred$mean, series="Exponential Forecast") +
autolayer(linear_mod_pred$mean, series = "Linear Forecast") +
autolayer(valid.ts, series = "Observed") +
autolayer(quad_mod$fitted.values, series = "Quadratic Fit") +
guides(color = guide_legend(title = ""))# Now, we build a regression-based model that capture both trend and seasonality.
linear_season_mod <- tslm(train.ts ~ trend + season)
linear_season_mod_pred <- forecast(linear_season_mod, h= nValid, level = 0)
autoplot(linear_season_mod_pred) +
autolayer(linear_season_mod_pred$mean, series = "Linear and Seasonality Forecast") +
#autolayer(season_mod_pred$mean, series ="Seasonality Forecast") +
#autolayer(quad_mod_pred$mean, series ="Quadratic Forecast") +
#autolayer(exp_mod_pred$mean, series="Exponential Forecast") +
#autolayer(linear_mod_pred$mean, series = "Linear Forecast") +
autolayer(valid.ts, series = "Observed") +
#autolayer(linear_season_mod$fitted.values, series = "Seasonality Fit") +
guides(color = guide_legend(title = ""))# We now consider poly_season_mod that capture seasonality and polynomial trend
poly_season_mod <- tslm(train.ts ~ trend + I(trend^2)+ season)
poly_season_mod_pred <- forecast(poly_season_mod, h= nValid, level = 0)
autoplot(poly_season_mod_pred) +
autolayer(poly_season_mod_pred$mean, series = "Quadratic and Seasonality Forecast") +
autolayer(linear_season_mod_pred$mean, series = "Linear and Seasonality Forecast") +
#autolayer(season_mod_pred$mean, series ="Seasonality Forecast") +
#autolayer(quad_mod_pred$mean, series ="Quadratic Forecast") +
#autolayer(exp_mod_pred$mean, series="Exponential Forecast") +
#autolayer(linear_mod_pred$mean, series = "Linear Forecast") +
autolayer(valid.ts, series = "Observed") +
#autolayer(poly_season_mod$fitted.values, series = "Quadratic and Seasonality Fit") +
guides(color = guide_legend(title = ""))## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -177 846 718 -1.65 5.3 -0.302 0.632
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -155 842 714 -1.49 5.26 -0.302 0.629
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 67.8 829 681 0.135 4.94 -0.297 0.617
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -361 1229 1095 -2.83 8.01 -0.101 0.898
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 61.8 1181 945 0.264 6.86 -0.0858 0.845
library(zoo)
ma.trailing <- rollmean(weekly.store.ts, k = 12, align = "right")
ma.centered <- ma(weekly.store.ts, order = 12)
autoplot(weekly.store.ts)+
autolayer(ma.trailing, series = "Trailing Moving Average") +
autolayer(ma.centered, series = "Centered Moving Average") +
xlab("Time") + ylab("Total Units Sold") +
ggtitle("Total Units Sold with moving average w=12")## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
ma.trailing <- rollmean(train.ts, k = 12, align = "right")
last.ma <- tail(ma.trailing, 1)
ma.trailing.pred <- ts(rep(last.ma, nValid), start = c(2022, nTrain + 1),
end = c(2022, nTrain + nValid), freq = 52)
autoplot(weekly.store.ts)+
autolayer(ma.trailing, series = "Moving Average") +
autolayer(ma.trailing.pred, series = "MA Forecast")lag1.diff <- diff(weekly.store.ts, lag = 1)
lag12.diff <- diff(weekly.store.ts, lag = 12)
diff.twice.ts <- diff(diff(weekly.store.ts, lag = 12), lag = 1)
diff.nValid <- 20
diff.nTrain <- length(diff.twice.ts) - diff.nValid
diff.train.ts <- window(diff.twice.ts, start = c(2022, 1), end = c(2022, diff.nTrain + 1))## Warning in window.default(x, ...): 'start' value not changed
diff.valid.ts <- window(diff.twice.ts, start = c(2022, diff.nTrain + 2), end = c(2022, diff.nTrain + 1 + diff.nValid))
# ETS with Additive noise (A) and no trend (N) and no seasonality (N) is SES
ses <- ets(diff.train.ts, model = "ANN", alpha = 0.2)
ses.pred <- forecast(ses, h = diff.nValid, level = 90)
# You also have dedicated function that can do this
# ses.pred <- ses(diff.train.ts, h = diff.nValid, alpha = 0.2)
autoplot(ses.pred) +
autolayer(ses.pred$fitted, series = "Fitted") +
ylab("Total Units Sold") + xlab("Time")We perform forecasting with Holt’s method (only trend) and compare with regression model with only trend.
# model "AAN" correspondes to additive error (A), additive trend (A) and no seasonality (N)
holt.mod <- ets(train.ts, model = "AAN")
holt.pred <- forecast(holt.mod, h = nValid, level = 0)
# Plot two models: Holt method and Polynomial Trend
autoplot(holt.pred) +
autolayer(holt.pred$mean, series = "Holt's method") +
autolayer(quad_mod_pred$mean, series = "Regression (trend)") +
autolayer(valid.ts, series = "Observed")# MMA means multiplicative error, additive trend, and additive seasonality.
# hwin <- ets(train.ts, model = "MAA")
# hwin.pred <- forecast(hwin, h = nValid, level = 0)
#
# autoplot(hwin.pred) +
# autolayer(hwin.pred$mean, series = "Holt-Winter Forecast")+
# autolayer(valid.ts, series = "Observed")
# This model does not work because frequency is too high (freq = 52). we can either remove trend (model = 'ANN') or reduce frequency (aggregate to monthly, freq = 12)Automation might not always be good. That is because the automated method chooses the best model fit for training data and may not do that well for validation data.
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 171 844 689 0.889 4.96 -0.302 0.624
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 13715 13749 13715 100 100 -0.482 8.24
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -174 846 717 -1.63 5.29 -0.302 0.632
# Basic ARIMA Model: (order = (1,1,1))
p <- 1; d <- 1; q <- 1
train.arima <- Arima(train.ts, order = c(p, d, q))
arima.forecast <- forecast(train.arima, h = nValid)
autoplot(arima.forecast) +
autolayer(valid.ts, series = "Observed") +
ggtitle("ARIMA Forecast (1,1,1)")train.auto.arima.no.seas <- auto.arima(train.ts, seasonal = FALSE)
arima.auto.forecast.no.seas <- forecast(train.auto.arima.no.seas, h = nValid)
autoplot(arima.auto.forecast.no.seas) +
autolayer(valid.ts, series = "Observed") +
ggtitle("Auto ARIMA Forecast (No Seasonality)")## Auto ARIMA with seasonality using seasonal dummies
# Extract seasonal factors from the training set as a factor
season <- as.factor(cycle(train.ts))
# Convert the seasonal factor to dummy variables and remove one dummy to avoid collinearity
xreg_train <- model.matrix(~ season)[, -1]
# Fit the auto.arima model with the seasonal dummy variables as external regressors
train.auto.arima <- auto.arima(train.ts, xreg = xreg_train)
# Generate seasonal factors for the forecast period (ensure levels match the training set)
future_season <- factor(rep(1:frequency(train.ts), length.out = nValid), levels = levels(season))
xreg_future <- model.matrix(~ future_season)[, -1]
# Forecast using the ARIMA model with seasonal dummy variables
arima.auto.forecast <- forecast(train.auto.arima, xreg = xreg_future, h = nValid)## Warning in forecast.forecast_ARIMA(train.auto.arima, xreg = xreg_future, : xreg
## contains different column names from the xreg used in training. Please check
## that the regressors are in the same order.
# Plot the forecast along with the observed values
autoplot(arima.auto.forecast) +
autolayer(valid.ts, series = "Observed") +
ggtitle("Auto ARIMA Forecast (With Seasonality)")# Create regressors for the training data:
# - 'trend_reg' is the time index,
# - 'trend2_reg' is its square,
# - and we convert the seasonal factor into dummy variables,
# setting levels explicitly and dropping one column to avoid collinearity.
trend_reg <- as.numeric(time(train.ts))
trend2_reg <- trend_reg^2
# Set factor levels explicitly based on the frequency of the training series
season_reg <- factor(cycle(train.ts), levels = as.character(1:frequency(train.ts)))
dummy_train <- model.matrix(~ season_reg)[, -1] # Remove one dummy column
# Combine regressors into a single matrix for training
xreg_train <- cbind(trend_reg, trend2_reg, dummy_train)
# Fit the ARIMA model using the regressors (with seasonal = FALSE because seasonality is modeled via dummies)
improved_poly_season_arima <- auto.arima(train.ts, xreg = xreg_train, seasonal = FALSE)
# Generate future regressors for the forecast period:
future_trend <- as.numeric(time(valid.ts))
future_trend2 <- future_trend^2
# Set the same factor levels for the future seasonal variable
future_season <- factor(cycle(valid.ts), levels = levels(season_reg))
dummy_future <- model.matrix(~ future_season)[, -1] # Drop one column as before
# Combine future regressors
xreg_future <- cbind(future_trend, future_trend2, dummy_future)
# Forecast using the fitted ARIMA model with the future regressors
forecast_improved <- forecast(improved_poly_season_arima, xreg = xreg_future, h = nValid)## Warning in forecast.forecast_ARIMA(improved_poly_season_arima, xreg =
## xreg_future, : xreg contains different column names from the xreg used in
## training. Please check that the regressors are in the same order.
# Plot the forecast along with the observed validation data
autoplot(forecast_improved) +
autolayer(valid.ts, series = "Observed") +
ggtitle("ARIMA with Regressors (Improved Polynomial with Seasonality)")## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -0.875 823 682 -0.366 4.97 -0.298 0.615
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set 5.05 827 685 -0.324 4.99 -0.302 0.614
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -116 987 830 -1.19 6.04 -0.107 0.755
## ME RMSE MAE MPE MAPE ACF1 Theil's U
## Test set -361 1229 1095 -2.83 8.01 -0.101 0.898
# Set a random seed for reproducibility
set.seed(123)
# Create consistent seasonal dummy variables for both training and validation
season_train <- factor(cycle(train.ts), levels = as.character(1:frequency(train.ts)))
season_dummy_train <- model.matrix(~ season_train)[, -1]
season_valid <- factor(cycle(valid.ts), levels = as.character(1:frequency(train.ts)))
season_dummy_valid <- model.matrix(~ season_valid)[, -1]
# Fit the neural network model with adjusted parameters:
# size = 15: a larger hidden layer to capture more complex patterns
# repeats = 50: more repeated trainings to stabilize the results
# decay = 0.001: smaller decay (weaker regularization) to reduce over-smoothing
nn_model_season <- nnetar(
y = train.ts,
xreg = season_dummy_train,
size = 15, # Increased number of hidden nodes
repeats = 50, # Keep multiple repeats for stability
decay = 0.001 # Reduced regularization
)
# Forecast using the fitted model and matching seasonal dummy variables for validation
nn_forecast_season <- forecast(
nn_model_season,
xreg = season_dummy_valid,
h = nValid
)## Warning in forecast.nnetar(nn_model_season, xreg = season_dummy_valid, h =
## nValid): xreg contains different column names from the xreg used in training.
## Please check that the regressors are in the same order.
# Plot the forecast alongside the observed values in the validation set
autoplot(nn_forecast_season) +
autolayer(valid.ts, series = "Observed") +
xlab("Time") +
ylab("Total Units Sold") +
ggtitle("Neural Network Forecast with Seasonal Dummies (Increased Size, Reduced Decay)") +
guides(color = guide_legend(title = "Series"))# Make sure each of these is a valid forecast object with a $mean component
model1.forecast <- arima.forecast # ARIMA(1,1,1)
model2.forecast <- quad_mod_pred # Quadratic Trend
model3.forecast <- ma.trailing.pred # Moving Average
model4.forecast <- average.forecast # Average
model5.forecast <- arima.auto.forecast.no.seas # Auto ARIMA (No Seas)# Suppose you have 5 forecast objects, each returned as a data frame
# with columns: "Point Forecast", "Lo 80", "Hi 80", "Lo 95", "Hi 95"
# We'll extract the "Point Forecast" column from each to get numeric vectors.
# 1) Extract numeric vectors for each forecast's point predictions
model1_vector <- model1.forecast$mean # ARIMA
model2_vector <- model2.forecast$mean # Quadratic Trend
model3_vector <- model3.forecast # Moving Average (already numeric)
model4_vector <- model4.forecast$mean # Average Method
model5_vector <- model5.forecast$mean
# 2) Verify that each vector is numeric and has the same length
# You can check with str(model1_vector), length(model1_vector), etc.
# 3) Compute a simple average of these 5 numeric vectors
num.models <- 5
comb.simple.avg <- (
model1_vector +
model2_vector +
model3_vector +
model4_vector +
model5_vector
) / num.models
# 4) Plot the training set, the averaged forecast, and the observed validation data
# Adjust the code to match your actual variable names for train.ts and valid.ts
autoplot(train.ts) +
autolayer(comb.simple.avg, series = "Simple Avg Comb") +
autolayer(valid.ts, series = "Observed") +
ggtitle("Combination of Top 5 Models - Simple Average (All Numeric)")# Put each model's forecasted mean into a data frame
forecast.vectors.df <- data.frame(
model1 = as.numeric(model1.forecast$mean),
model2 = as.numeric(model2.forecast$mean),
model3 = as.numeric(model3.forecast),
model4 = as.numeric(model4.forecast$mean),
model5 = as.numeric(model5.forecast$mean)
)
# Compute a 20% trimmed mean for each forecast horizon
# With 5 models, this effectively removes the highest & lowest value each time step
forecast.vectors.df$comb.trimmed.avg <- apply(
forecast.vectors.df,
1,
function(x) mean(x, trim = 0.2)
)
# Convert trimmed mean to a ts object (adjust start/frequency to your validation range)
comb.trimmed.avg <- ts(
forecast.vectors.df$comb.trimmed.avg,
start = c(2023, 34), # Example
frequency = 52 # Example for weekly data
)
# Plot: training set, trimmed average combo, and observed validation
autoplot(train.ts) +
autolayer(comb.trimmed.avg, series = "Trimmed Avg Comb") +
autolayer(valid.ts, series = "Observed") +
ggtitle("Combination of Top 5 Models - Trimmed Mean")# 1. Create a data frame of the 5 model forecasts
forecast.vectors.df <- data.frame(
model1 = as.numeric(model1.forecast$mean),
model2 = as.numeric(model2.forecast$mean),
model3 = as.numeric(model3.forecast),
model4 = as.numeric(model4.forecast$mean),
model5 = as.numeric(model5.forecast$mean)
)
# 2. Add the validation data as the dependent variable
forecast.vectors.df$valid <- as.numeric(valid.ts)
# 3. Fit a linear model to find the best weights for each forecast
forecasts.lm <- lm(valid ~ model1 + model2 + model3 + model4 + model5, data = forecast.vectors.df)
summary(forecasts.lm)##
## Call:
## lm(formula = valid ~ model1 + model2 + model3 + model4 + model5,
## data = forecast.vectors.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1280.0 -475.0 -66.7 575.7 1694.1
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.60e+05 1.60e+05 -1.00 0.33
## model1 1.37e+01 1.26e+01 1.09 0.29
## model2 -1.10e+00 4.36e+00 -0.25 0.80
## model3 NA NA NA NA
## model4 NA NA NA NA
## model5 NA NA NA NA
##
## Residual standard error: 866 on 17 degrees of freedom
## Multiple R-squared: 0.0672, Adjusted R-squared: -0.0426
## F-statistic: 0.612 on 2 and 17 DF, p-value: 0.554
# 4. Convert fitted values into a ts object (regression-based combo)
comb.regression <- ts(
forecasts.lm$fitted.values,
start = c(2023, 34), # Example
frequency = 52 # Example for weekly data
)
# 5. Plot the training set, regression-based combo, and observed validation
autoplot(train.ts) +
autolayer(comb.regression, series = "Regression-Based Comb") +
autolayer(valid.ts, series = "Observed") +
ggtitle("Combination of Top 5 Models - Regression Fit")# Compute accuracy for each model
acc_avg <- accuracy(average.forecast$mean, valid.ts)
acc_naive <- accuracy(naive.forecast$mean, valid.ts)
acc_drift <- accuracy(drift.forecast$mean, valid.ts)
acc_snaive <- accuracy(seasonal.naive.forecast$mean, valid.ts)
acc_linear <- accuracy(linear_mod_pred$mean, valid.ts)
acc_exp <- accuracy(exp_mod_pred$mean, valid.ts)
acc_quad <- accuracy(quad_mod_pred$mean, valid.ts)
acc_lin_seas <- accuracy(linear_season_mod_pred$mean, valid.ts)
acc_poly_seas <- accuracy(poly_season_mod_pred$mean, valid.ts)
acc_ma <- accuracy(ma.trailing.pred, valid.ts)
acc_ses <- accuracy(ses.pred$mean, valid.ts)
acc_holt <- accuracy(holt.pred$mean, valid.ts)
acc_arima <- accuracy(arima.forecast$mean, valid.ts)
acc_arima_ns <- accuracy(arima.auto.forecast.no.seas$mean, valid.ts)
acc_arima_s <- accuracy(arima.auto.forecast$mean, valid.ts)
acc_improved <- accuracy(forecast_improved$mean, valid.ts)
acc_nn <- accuracy(nn_forecast_season$mean, valid.ts)
acc_combsim <- accuracy(comb.simple.avg, valid.ts)
acc_combtrim <- accuracy(comb.trimmed.avg, valid.ts)
acc_combre <- accuracy(comb.regression, valid.ts)
# 2. Create a data frame of all the accuracy metrics of interest
acc_table <- data.frame(
Model = c("Average",
"Naive",
"Drift",
"Seasonal Naive",
"Linear Trend",
"Exponential Trend",
"Quadratic Trend",
"Linear+Season",
"Poly+Season",
"Moving Average",
"SES",
"Holt",
"ARIMA(1,1,1)",
"Auto ARIMA (No Seas)",
"Auto ARIMA (Seas)",
"ARIMA + Regressors",
"NN + Seasonal Dummies",
"Forecast Combination simple",
"Forecast Combination trim",
"Forecast Combination regression"),
ME = c(acc_avg[1, "ME"],
acc_naive[1, "ME"],
acc_drift[1, "ME"],
acc_snaive[1, "ME"],
acc_linear[1, "ME"],
acc_exp[1, "ME"],
acc_quad[1, "ME"],
acc_lin_seas[1, "ME"],
acc_poly_seas[1, "ME"],
acc_ma[1, "ME"],
acc_ses[1, "ME"],
acc_holt[1, "ME"],
acc_arima[1, "ME"],
acc_arima_ns[1, "ME"],
acc_arima_s[1, "ME"],
acc_improved[1, "ME"],
acc_nn[1, "ME"],
acc_combsim[1, "ME"],
acc_combtrim[1, "ME"],
acc_combre[1, "ME"]),
RMSE = c(acc_avg[1, "RMSE"],
acc_naive[1, "RMSE"],
acc_drift[1, "RMSE"],
acc_snaive[1, "RMSE"],
acc_linear[1, "RMSE"],
acc_exp[1, "RMSE"],
acc_quad[1, "RMSE"],
acc_lin_seas[1, "RMSE"],
acc_poly_seas[1, "RMSE"],
acc_ma[1, "RMSE"],
acc_ses[1, "RMSE"],
acc_holt[1, "RMSE"],
acc_arima[1, "RMSE"],
acc_arima_ns[1, "RMSE"],
acc_arima_s[1, "RMSE"],
acc_improved[1, "RMSE"],
acc_nn[1, "RMSE"],
acc_combsim[1, "RMSE"],
acc_combtrim[1, "RMSE"],
acc_combre[1, "RMSE"]),
MAE = c(acc_avg[1, "MAE"],
acc_naive[1, "MAE"],
acc_drift[1, "MAE"],
acc_snaive[1, "MAE"],
acc_linear[1, "MAE"],
acc_exp[1, "MAE"],
acc_quad[1, "MAE"],
acc_lin_seas[1, "MAE"],
acc_poly_seas[1, "MAE"],
acc_ma[1, "MAE"],
acc_ses[1, "MAE"],
acc_holt[1, "MAE"],
acc_arima[1, "MAE"],
acc_arima_ns[1, "MAE"],
acc_arima_s[1, "MAE"],
acc_improved[1, "MAE"],
acc_nn[1, "MAE"],
acc_combsim[1, "MAE"],
acc_combtrim[1, "MAE"],
acc_combre[1, "MAE"]),
MPE = c(acc_avg[1, "MPE"],
acc_naive[1, "MPE"],
acc_drift[1, "MPE"],
acc_snaive[1, "MPE"],
acc_linear[1, "MPE"],
acc_exp[1, "MPE"],
acc_quad[1, "MPE"],
acc_lin_seas[1, "MPE"],
acc_poly_seas[1, "MPE"],
acc_ma[1, "MPE"],
acc_ses[1, "MPE"],
acc_holt[1, "MPE"],
acc_arima[1, "MPE"],
acc_arima_ns[1, "MPE"],
acc_arima_s[1, "MPE"],
acc_improved[1, "MPE"],
acc_nn[1, "MPE"],
acc_combsim[1, "MPE"],
acc_combtrim[1, "MPE"],
acc_combre[1, "MPE"]),
MAPE = c(acc_avg[1, "MAPE"],
acc_naive[1, "MAPE"],
acc_drift[1, "MAPE"],
acc_snaive[1, "MAPE"],
acc_linear[1, "MAPE"],
acc_exp[1, "MAPE"],
acc_quad[1, "MAPE"],
acc_lin_seas[1, "MAPE"],
acc_poly_seas[1, "MAPE"],
acc_ma[1, "MAPE"],
acc_ses[1, "MAPE"],
acc_holt[1, "MAPE"],
acc_arima[1, "MAPE"],
acc_arima_ns[1, "MAPE"],
acc_arima_s[1, "MAPE"],
acc_improved[1, "MAPE"],
acc_nn[1, "MAPE"],
acc_combsim[1, "MAPE"],
acc_combtrim[1, "MAPE"],
acc_combre[1, "MAPE"]))
# 3. (Optional) Print the data frame
acc_table ## Model ME RMSE MAE MPE MAPE
## 1 Average 5.05e+00 827 685 -0.32423 4.99
## 2 Naive -9.26e+02 1242 1040 -7.12726 7.87
## 3 Drift -9.49e+02 1259 1059 -7.29354 8.01
## 4 Seasonal Naive -9.49e+01 1175 985 -0.88198 7.17
## 5 Linear Trend -1.77e+02 846 718 -1.65445 5.30
## 6 Exponential Trend -1.55e+02 842 714 -1.49072 5.26
## 7 Quadratic Trend 6.78e+01 829 681 0.13499 4.94
## 8 Linear+Season -3.61e+02 1229 1095 -2.82630 8.01
## 9 Poly+Season 6.18e+01 1181 945 0.26356 6.86
## 10 Moving Average 1.71e+02 844 689 0.88878 4.96
## 11 SES 1.37e+04 13749 13715 100.27475 100.27
## 12 Holt -1.74e+02 846 717 -1.63075 5.29
## 13 ARIMA(1,1,1) -8.75e-01 823 682 -0.36575 4.97
## 14 Auto ARIMA (No Seas) 5.05e+00 827 685 -0.32423 4.99
## 15 Auto ARIMA (Seas) -1.16e+02 987 830 -1.18910 6.04
## 16 ARIMA + Regressors -3.61e+02 1229 1095 -2.82701 8.01
## 17 NN + Seasonal Dummies -1.43e+02 822 718 -1.39142 5.28
## 18 Forecast Combination simple 4.96e+01 827 678 0.00191 4.93
## 19 Forecast Combination trim 2.60e+01 827 680 -0.17116 4.95
## 20 Forecast Combination regression -1.39e-13 798 640 -0.33729 4.68
# 4. (Optional) If you're in an R Markdown, you can nicely display it as a table
library(knitr)
kable(acc_table, digits = 3, caption = "Accuracy Comparison of All Models")| Model | ME | RMSE | MAE | MPE | MAPE |
|---|---|---|---|---|---|
| Average | 5.053 | 827 | 685 | -0.324 | 4.99 |
| Naive | -926.300 | 1242 | 1040 | -7.127 | 7.88 |
| Drift | -949.050 | 1259 | 1059 | -7.294 | 8.01 |
| Seasonal Naive | -94.900 | 1175 | 985 | -0.882 | 7.17 |
| Linear Trend | -177.035 | 846 | 718 | -1.654 | 5.30 |
| Exponential Trend | -154.619 | 842 | 714 | -1.491 | 5.26 |
| Quadratic Trend | 67.843 | 829 | 681 | 0.135 | 4.94 |
| Linear+Season | -361.072 | 1229 | 1095 | -2.826 | 8.01 |
| Poly+Season | 61.849 | 1181 | 945 | 0.264 | 6.86 |
| Moving Average | 171.117 | 844 | 689 | 0.889 | 4.96 |
| SES | 13715.396 | 13749 | 13715 | 100.275 | 100.28 |
| Holt | -173.792 | 846 | 717 | -1.631 | 5.29 |
| ARIMA(1,1,1) | -0.875 | 823 | 682 | -0.366 | 4.97 |
| Auto ARIMA (No Seas) | 5.053 | 827 | 685 | -0.324 | 4.99 |
| Auto ARIMA (Seas) | -115.950 | 987 | 830 | -1.189 | 6.04 |
| ARIMA + Regressors | -361.169 | 1229 | 1095 | -2.827 | 8.01 |
| NN + Seasonal Dummies | -142.647 | 822 | 718 | -1.391 | 5.28 |
| Forecast Combination simple | 49.638 | 827 | 678 | 0.002 | 4.93 |
| Forecast Combination trim | 25.983 | 827 | 680 | -0.171 | 4.95 |
| Forecast Combination regression | 0.000 | 798 | 640 | -0.337 | 4.67 |
library(dplyr)
acc_table_ranked <- acc_table %>%
arrange(MAPE) %>% # sort by ascending MAPE
mutate(
Rank = row_number() # add Rank column
)
# Reset row names if you like
row.names(acc_table_ranked) <- NULL
acc_table_ranked## Model ME RMSE MAE MPE MAPE Rank
## 1 Forecast Combination regression -1.39e-13 798 640 -0.33729 4.68 1
## 2 Forecast Combination simple 4.96e+01 827 678 0.00191 4.93 2
## 3 Quadratic Trend 6.78e+01 829 681 0.13499 4.94 3
## 4 Forecast Combination trim 2.60e+01 827 680 -0.17116 4.95 4
## 5 Moving Average 1.71e+02 844 689 0.88878 4.96 5
## 6 ARIMA(1,1,1) -8.75e-01 823 682 -0.36575 4.97 6
## 7 Average 5.05e+00 827 685 -0.32423 4.99 7
## 8 Auto ARIMA (No Seas) 5.05e+00 827 685 -0.32423 4.99 8
## 9 Exponential Trend -1.55e+02 842 714 -1.49072 5.26 9
## 10 NN + Seasonal Dummies -1.43e+02 822 718 -1.39142 5.28 10
## 11 Holt -1.74e+02 846 717 -1.63075 5.29 11
## 12 Linear Trend -1.77e+02 846 718 -1.65445 5.30 12
## 13 Auto ARIMA (Seas) -1.16e+02 987 830 -1.18910 6.04 13
## 14 Poly+Season 6.18e+01 1181 945 0.26356 6.86 14
## 15 Seasonal Naive -9.49e+01 1175 985 -0.88198 7.17 15
## 16 Naive -9.26e+02 1242 1040 -7.12726 7.87 16
## 17 Linear+Season -3.61e+02 1229 1095 -2.82630 8.01 17
## 18 ARIMA + Regressors -3.61e+02 1229 1095 -2.82701 8.01 18
## 19 Drift -9.49e+02 1259 1059 -7.29354 8.01 19
## 20 SES 1.37e+04 13749 13715 100.27475 100.27 20
# (Optional) If you're in an R Markdown, you can nicely display it as a table
library(knitr)
kable(acc_table_ranked, digits = 3, caption = "Accuracy Comparison of All Models")| Model | ME | RMSE | MAE | MPE | MAPE | Rank |
|---|---|---|---|---|---|---|
| Forecast Combination regression | 0.000 | 798 | 640 | -0.337 | 4.67 | 1 |
| Forecast Combination simple | 49.638 | 827 | 678 | 0.002 | 4.93 | 2 |
| Quadratic Trend | 67.843 | 829 | 681 | 0.135 | 4.94 | 3 |
| Forecast Combination trim | 25.983 | 827 | 680 | -0.171 | 4.95 | 4 |
| Moving Average | 171.117 | 844 | 689 | 0.889 | 4.96 | 5 |
| ARIMA(1,1,1) | -0.875 | 823 | 682 | -0.366 | 4.97 | 6 |
| Average | 5.053 | 827 | 685 | -0.324 | 4.99 | 7 |
| Auto ARIMA (No Seas) | 5.053 | 827 | 685 | -0.324 | 4.99 | 8 |
| Exponential Trend | -154.619 | 842 | 714 | -1.491 | 5.26 | 9 |
| NN + Seasonal Dummies | -142.647 | 822 | 718 | -1.391 | 5.28 | 10 |
| Holt | -173.792 | 846 | 717 | -1.631 | 5.29 | 11 |
| Linear Trend | -177.035 | 846 | 718 | -1.654 | 5.30 | 12 |
| Auto ARIMA (Seas) | -115.950 | 987 | 830 | -1.189 | 6.04 | 13 |
| Poly+Season | 61.849 | 1181 | 945 | 0.264 | 6.86 | 14 |
| Seasonal Naive | -94.900 | 1175 | 985 | -0.882 | 7.17 | 15 |
| Naive | -926.300 | 1242 | 1040 | -7.127 | 7.88 | 16 |
| Linear+Season | -361.072 | 1229 | 1095 | -2.826 | 8.01 | 17 |
| ARIMA + Regressors | -361.169 | 1229 | 1095 | -2.827 | 8.01 | 18 |
| Drift | -949.050 | 1259 | 1059 | -7.294 | 8.01 | 19 |
| SES | 13715.396 | 13749 | 13715 | 100.275 | 100.28 | 20 |
p <- 1; d <- 1; q <- 1
whole.arima <- Arima(weekly.store.ts, order = c(p, d, q))
arima.forecast1 <- forecast(whole.arima, h = 20)# ARIMA(1,1,1)
quad_mod1 <- tslm(weekly.store.ts ~ trend + I(trend^2))
quad_mod_pred1 <- forecast(quad_mod1, h= 20, level=0) # Quadratic Trend
ma.trailing1 <- rollmean(weekly.store.ts, k = 12, align = "right")
last.ma1 <- tail(ma.trailing1, 1)
ma.trailing.pred1 <- ts(rep(last.ma1, 20), start = c(2024, 2),
freq = 52) # Moving Average
average.forecast1 <- meanf(weekly.store.ts, h = nValid) # Average
train.auto.arima.no.seas1 <- auto.arima(weekly.store.ts, seasonal = FALSE)
arima.auto.forecast.no.seas1 <- forecast(train.auto.arima.no.seas1, h = 20)# 1. Create a data frame of the 5 model forecasts
forecast.vectors.df <- data.frame(
model1 = as.numeric(arima.forecast1$mean),
model2 = as.numeric(quad_mod_pred1$mean),
model3 = as.numeric(ma.trailing.pred1),
model4 = as.numeric(average.forecast1$mean),
model5 = as.numeric(arima.auto.forecast.no.seas1$mean)
)
# 2. Add the validation data as the dependent variable
# forecast.vectors.df$weekly <- as.numeric(weekly.store.ts)
forecast.vectors.df$actual <- rowMeans(forecast.vectors.df[, c("model1", "model2", "model3", "model4", "model5")])
# 3. Fit a linear model to find the best weights for each forecast
forecasts.lm <- lm(actual ~ model1 + model2 + model3 + model4 + model5, data = forecast.vectors.df)
summary(forecasts.lm)## Warning in summary.lm(forecasts.lm): essentially perfect fit: summary may be
## unreliable
##
## Call:
## lm(formula = actual ~ model1 + model2 + model3 + model4 + model5,
## data = forecast.vectors.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.98e-13 -6.04e-13 5.53e-14 4.52e-13 1.19e-12
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.25e+03 3.75e-09 2.20e+12 <2e-16 ***
## model1 2.00e-01 2.72e-13 7.35e+11 <2e-16 ***
## model2 2.00e-01 2.83e-15 7.07e+13 <2e-16 ***
## model3 NA NA NA NA
## model4 NA NA NA NA
## model5 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.56e-13 on 17 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 2.87e+27 on 2 and 17 DF, p-value: <2e-16
# 4. Convert fitted values into a ts object (regression-based combo)
comb.regression <- ts(
forecasts.lm$fitted.values,
start = c(2024, 2), # Example
frequency = 52 # Example for weekly data
)
# 5. Plot the training set, regression-based combo, and observed validation
autoplot(weekly.store.ts) +
autolayer(comb.regression, series = "Regression-Based Comb") +
autolayer(valid.ts, series = "Observed") +
ggtitle("Combination of Top 5 Models - Regression Fit")